Scraping the data

We’ll be scraping the Onion Date Wise market arrival data from the NHRDF website. For this, two forms are submitted, one for each of the years 2016 and 2017. The reason I am using two form submissions is to stage the data ingestion, as NHRDF is quite slow when a POST request is submitted.

# Loading the required libraries 
# rvest: session/form scraping; tidyr/stringr/dplyr: wrangling;
# ggplot2/ggmap/plotly: visualisation; prophet: time-series forecasting.
library(rvest)
## Loading required package: xml2
library(tidyr)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggmap)
library(prophet)
## Loading required package: Rcpp
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
## 
##     wind
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
#  The URL to create an html session
# Open a stateful browsing session on the NHRDF page and pull every <form>
# it contains; the first form is the market-arrival query form used below.
# NOTE(review): html_session()/set_values()/submit_form() were renamed
# session()/html_form_set()/session_submit() in rvest 1.0 — confirm the
# installed rvest version before re-running.
url = 'http://nhrdf.org/en-us/DailyWiseMarketArrivals'
nhrdf.session =  url %>% html_session()
nhrdf.form =nhrdf.session %>% html_form()

Now we have the form. Even though the form gives us options to choose by name, inspecting the HTML clearly shows that we need to provide a value for each one of the fields. Leaving a field blank (for month and market) makes it equal to all.

Note: For Onion, crop=1

# Filling the values to submit the form

# Two years are scraped in separate requests because the site is slow.
# crop = 1 selects Onion; empty strings for month/market mean
# "all months" / "all markets" (see note above).
year = c(2016,2017)
crop = 1
month =""
market =""

We will first scrape the data for year 2016

# Fill the 2016 query.  Field names come from inspecting the page's HTML;
# only the Year value differs between the 2016 and 2017 submissions.
nhrdf.form.filled.2016 = set_values(nhrdf.form[[1]],
           'dnn$dnnLANG$selectCulture' = "en-US",
           'dnn$ctr966$DailyWiseMarketArrivals$Crop' = crop,
           'dnn$ctr966$DailyWiseMarketArrivals$Year'= year[1],
           'dnn$ctr966$DailyWiseMarketArrivals$Market' = market,
           'dnn$ctr966$DailyWiseMarketArrivals$MonthName' = month)

# Submit the filled form within the session and parse the response HTML.
nhrdf.submit.2016 = submit_form(nhrdf.session, nhrdf.form.filled.2016)
## Submitting with 'query'
nhrdf.html.out.2016 = read_html(nhrdf.submit.2016)

# The results grid has a stable DOM id; html_table() converts the <table>
# node into a data frame.
nhrdf.table.2016 = nhrdf.html.out.2016 %>% 
    html_node("#dnn_ctr966_DailyWiseMarketArrivals_GridView1")  %>%
    html_table()
head(nhrdf.table.2016)
##          Date     Market Arrival(q) Price Minimum (Rs/q)
## 1 10/Jun/2016 ABOHAR(PB)        350                  450
## 2 28/Jun/2016 ABOHAR(PB)        300                  540
## 3 29/Jun/2016 ABOHAR(PB)         30                  550
## 4 30/Jun/2016 ABOHAR(PB)         30                  550
## 5 12/Aug/2016 ABOHAR(PB)         60                  650
## 6 12/Sep/2016 ABOHAR(PB)        400                  650
##   Price Maximum (Rs/q) Modal Price (Rs/q)
## 1                  650                550
## 2                  670                600
## 3                  850                650
## 4                  850                650
## 5                 1065                850
## 6                  800                725

Same thing for year 2017

# Same query as above with Year = 2017 (year[2]).
nhrdf.form.filled.2017 = set_values(nhrdf.form[[1]],
           'dnn$dnnLANG$selectCulture' = "en-US",
           'dnn$ctr966$DailyWiseMarketArrivals$Crop' = crop,
           'dnn$ctr966$DailyWiseMarketArrivals$Year'= year[2],
           'dnn$ctr966$DailyWiseMarketArrivals$Market' = market,
           'dnn$ctr966$DailyWiseMarketArrivals$MonthName' = month)

# Submit and parse, exactly as for 2016.
nhrdf.submit.2017 = submit_form(nhrdf.session, nhrdf.form.filled.2017)
## Submitting with 'query'
nhrdf.html.out.2017 = read_html(nhrdf.submit.2017)

# Extract the same results grid for 2017.
nhrdf.table.2017= nhrdf.html.out.2017 %>% 
    html_node("#dnn_ctr966_DailyWiseMarketArrivals_GridView1")  %>%
    html_table()
head(nhrdf.table.2017)
##          Date     Market Arrival(q) Price Minimum (Rs/q)
## 1 02/Jan/2017 ABOHAR(PB)        200                  750
## 2 02/Jan/2017   AGRA(UP)       2850                  600
## 3 03/Jan/2017   AGRA(UP)       2950                  800
## 4 04/Jan/2017   AGRA(UP)       3400                  780
## 5 06/Jan/2017   AGRA(UP)       3800                  750
## 6 07/Jan/2017   AGRA(UP)       3700                  750
##   Price Maximum (Rs/q) Modal Price (Rs/q)
## 1                 1000                850
## 2                  925                850
## 3                  900                840
## 4                  900                830
## 5                  850                800
## 6                  850                810

Save these two dataframes to CSV files so that we don’t have to scrape every time. This is true at least for the 2016 data.

# Cache the scraped tables locally so the site is not hit on every run.
# row.names = FALSE stops write.csv() from emitting the row-name column
# that read.csv() would otherwise resurrect as a spurious "X" column
# (the later `df.combine$X = NULL` clean-up then becomes a harmless no-op).
write.csv(nhrdf.table.2016, "DailyMarket2016.csv", row.names = FALSE)
write.csv(nhrdf.table.2017, "DailyMarket2017.csv", row.names = FALSE)

We’ll now read the csv and combine these two into one single dataframe

# Read the csv files
df1= read.csv('DailyMarket2016.csv')
df2 = read.csv('DailyMarket2017.csv')


#  Remove the last line for totals as it is inappropriate
# The scraped grid ends with a grand-total row (Market == 'Total') that
# must not be mixed in with the per-market observations.
df1 = df1 %>% filter(Market != 'Total')
df2 = df2%>% filter(Market != 'Total')

# Stack both years into one data frame.  The "unequal factor levels"
# warnings arise because read.csv() created factor columns whose level
# sets differ between the two years; bind_rows() coerces them to
# character, which is what we want anyway.
df.combine = bind_rows(df1,df2)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
str(df.combine)
## 'data.frame':    26106 obs. of  7 variables:
##  $ X                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                : chr  "10/Jun/2016" "28/Jun/2016" "29/Jun/2016" "30/Jun/2016" ...
##  $ Market              : chr  "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
##  $ Arrival.q.          : int  350 300 30 30 60 400 150 200 200 40 ...
##  $ Price.Minimum..Rs.q.: chr  "450" "540" "550" "550" ...
##  $ Price.Maximum..Rs.q.: chr  "650" "670" "850" "850" ...
##  $ Modal.Price..Rs.q.  : chr  "550" "600" "650" "650" ...

After having a look at the structure, it seems that the column names are quite kludgy, and the date and price columns need to be typecast and formatted properly

#Drop column X as it is of no use
# (X is the row-name column write.csv() added when the data was cached.)
df.combine$X = NULL

# New column names
# Shorter, syntactic names replacing scraped headers like
# "Price Minimum (Rs/q)"; order must match the columns shown in str() above.
col = c('Date','Market','Quantity','PriceMin','PriceMax','PriceMod')

colnames(df.combine) = col

# Change the Price to numeric type
# Safe here because bind_rows() already coerced these columns to
# character (as.numeric() on a factor would return level codes, not values).
df.combine$PriceMin =as.numeric(df.combine$PriceMin)
df.combine$PriceMax =as.numeric(df.combine$PriceMax)
df.combine$PriceMod =as.numeric(df.combine$PriceMod)

# Change the date format
# Dates arrive as e.g. "10/Jun/2016" -> day / abbreviated month / year.
df.combine$Date = as.Date(df.combine$Date,format ="%d/%b/%Y")

str(df.combine)
## 'data.frame':    26106 obs. of  6 variables:
##  $ Date    : Date, format: "2016-06-10" "2016-06-28" ...
##  $ Market  : chr  "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
##  $ Quantity: int  350 300 30 30 60 400 150 200 200 40 ...
##  $ PriceMin: num  450 540 550 550 650 650 750 650 650 750 ...
##  $ PriceMax: num  650 670 850 850 1065 ...
##  $ PriceMod: num  550 600 650 650 850 725 850 850 700 850 ...

The prediction needs to be based on State, but we can see that the city and state are stacked together in the Market column. In fact, the characters within the brackets are state abbreviations. We need to separate this into two columns.

# Creating two new columns - City and State
# The state is the parenthesised suffix of Market, e.g. "ABOHAR(PB)".
# extra/fill are passed explicitly — identical to separate()'s default
# behaviour of dropping surplus pieces and right-filling missing ones
# with NA — so the split no longer emits the "Too many/Too few values"
# warnings.  Rows whose Market has no "(STATE)" part get State = NA.
df.combine = df.combine %>%
  mutate(market1 = Market) %>%
  separate(market1, c('City', 'State'), sep = "\\(",
           extra = "drop", fill = "right")

# Don't need the ending parenthesis
df.combine$State = df.combine$State %>% str_replace("\\)", "")
df.combine$State = toupper(df.combine$State)
df.combine = df.combine %>% arrange(City, Date)

# Also, we can drop the market column
df.combine$Market = NULL

# Check how many states we have
# NOTE(review): besides the MP/M.P. duplicate handled below, "TN " vs
# "TN" (trailing space) and "CHATT" vs "CHGARH" also look like duplicate
# state codes worth normalising, and "OTHERS"/"F&V " are not states.
unique(df.combine$State)
##  [1] "PB"        "UP"        "GUJ"       "MS"        "OR"       
##  [6] "RAJ"       "WB"        NA          "KNT"       "BHR"      
## [11] "TELANGANA" "KER"       "TN "       "UTT"       "OTHERS"   
## [16] "MP"        "TN"        "HR"        "AS"        "HP"       
## [21] "AP"        "M.P."      "RJ"        "CHATT"     "CHGARH"   
## [26] "F&V "

We can see a couple of issues

  1. MP is represented twice (MP and M.P.)
  2. We have cities where no state was mentioned, so R filled those with NA.

Resolving issue one

# Madhya Pradesh is represented twice (MP & M.P.)
# fixed() makes the pattern a literal string: in a regex "M.P." the dots
# match ANY character, so the unescaped pattern could also rewrite
# unintended four-character values starting with M and containing P.
df.combine$State = df.combine$State %>% str_replace(fixed("M.P."), "MP")
unique(df.combine$State)
##  [1] "PB"        "UP"        "GUJ"       "MS"        "OR"       
##  [6] "RAJ"       "WB"        NA          "KNT"       "BHR"      
## [11] "TELANGANA" "KER"       "TN "       "UTT"       "OTHERS"   
## [16] "MP"        "TN"        "HR"        "AS"        "HP"       
## [21] "AP"        "RJ"        "CHATT"     "CHGARH"    "F&V "

For the second issue we can code a Get_State function like the one below, which takes in a city and returns a state

# Look up the state for a city name: forward-geocode the city to a
# lon/lat pair, reverse-geocode that point with output = 'more', and read
# the administrative_area_level_1 field (the state) from the result.
# NOTE(review): ggmap's geocode()/revgeocode() now require a registered
# Google API key (register_google()) — confirm before re-running.
Get_State = function(City){
  geo_city = geocode(City)
  State = revgeocode(as.numeric(geo_city),output='more')
  return(State$administrative_area_level_1)
}

State = Get_State("AGRA")
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=AGRA&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?latlng=27.1766701,78.0080745&sensor=false
State
## [1] Uttar Pradesh
## Levels: Uttar Pradesh
# For all cities uncomment below lines of code. Keep in mind the output depends on whether the city # names present in dataframe are correct and Google can identify those
# (The original commented loop iterated `for (i in range)` where range was
# a single length value, so it would have run exactly once; fixed to loop
# over every city, with unique() hoisted out of the loop body.)

# cities = unique(df.combine$City)
# for (i in seq_along(cities)) {
#   State[i] = Get_State(cities[i])
# }

However, since the city data is not that accurate, I chose to ignore the NAs

# Dropping rows whose State is missing (NA): these are markets scraped
# without a "(STATE)" suffix.  !is.na() is the idiomatic form of the
# original `is.na(...) == FALSE` comparison.

df.combine = df.combine[!is.na(df.combine$State), ]

unique(df.combine$State)
##  [1] "PB"        "UP"        "GUJ"       "MS"        "OR"       
##  [6] "RAJ"       "WB"        "KNT"       "BHR"       "TELANGANA"
## [11] "KER"       "TN "       "UTT"       "OTHERS"    "MP"       
## [16] "TN"        "HR"        "AS"        "HP"        "AP"       
## [21] "RJ"        "CHATT"     "CHGARH"    "F&V "

Finally, we can summarise which state has the highest sales in terms of quantity

# Summarising by State
# Total arrival quantity per state, sorted largest-first; the top row is
# therefore the state with the highest arrivals in this data.
df.state =  df.combine %>%  group_by(State) %>% 
            summarise(Quantity_State =sum(Quantity)) %>%
            arrange(desc(Quantity_State)) 
df.state[1,1]
## # A tibble: 1 × 1
##   State
##   <chr>
## 1    MS

So, Maharashtra is at the top. We can now move forward to predict the data for the next 30 days. However, before that we can explore a few more things

# Cherry Picking top 5 cities for MS
# Quantity per (State, City), restricted to Maharashtra, sorted
# descending; head(5) keeps the five largest markets.
df.MS_Cities = df.combine %>% group_by(State,City) %>%
              summarise(Quantity=sum(Quantity)) %>%  filter(State == 'MS') %>%
                arrange(desc(Quantity))  %>% 
               head(5)
# Horizontal bar chart, one bar per city, quantity scaled to thousands of
# quintals via weight = Quantity/1000.
MS_City_plot = ggplot(df.MS_Cities) + aes(reorder(City,Quantity),weight =Quantity/1000,fill=City) + geom_bar() + coord_flip()    

ggplotly(MS_City_plot)
# Plotting it on map
# Forward-geocode the five city names and bind the lon/lat columns on.
# NOTE(review): the keyless Google endpoints and the Stamen tile server in
# the URLs below have since changed (API key required; Stamen tiles moved
# to Stadia) — confirm before re-running.
geo =geocode(df.MS_Cities$City)
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=SOLAPUR&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PIMPALGAON&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=LASALGAON&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PUNE&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=YEOLA&sensor=false
df.MS_Cities = bind_cols(df.MS_Cities,geo)

# Base map of India (watercolor tiles), then overlay the five markets as
# points sized by quantity.
map=get_map("India",maptype="watercolor",source="stamen",zoom=5)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=India&zoom=5&size=640x640&scale=2&maptype=terrain&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=India&sensor=false
## Map from URL : http://tile.stamen.com/watercolor/5/21/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/21/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/21/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/21/15.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/15.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/15.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/15.jpg
ggmap(map)

ggmap(map) + geom_point(data=df.MS_Cities,aes(lon,lat,size=Quantity,color=City))

Predicting Price for next 30 days

To graph the different price types for these 5 cities, the data — currently in wide format — has to be converted into long (tall) format

# Keep only rows for the top-5 MS cities and reshape the three price
# columns into long format for faceted plotting.
# %in% is a proper set-membership test; the original
# `City == df.MS_Cities$City` recycled the 5-element vector against the
# whole column, silently keeping only rows whose position happened to
# line up with the matching city.
df.MS = df.combine %>%
        filter(State == 'MS', City %in% df.MS_Cities$City) %>%
        select(Date, PriceMin, PriceMod, PriceMax, City, State) %>%
        gather('PriceType', 'Value', 2:4) %>%
        arrange(City, Date)

# One panel per city, one line per price type.
Predict = ggplot(df.MS) + aes(Date, Value, color = PriceType) + geom_line() + facet_grid(. ~ City)

ggplotly(Predict)

We can predict prices for each city within Maharashtra, but that would be a little tedious considering we are only predicting at the state level.

To overcome the above limitation, we will take the mean price on a particular date across all cities of MS. This gives a good estimate for the price of onions in Maharashtra as a whole. We can then predict the price for the next 30-day period.

# Build the forecasting frame: mean modal price per day across all MS
# markets; PriceMod != 0 drops days where the modal price was reported
# as zero (effectively missing).
df.Predict = df.combine %>%
             filter(State=='MS',PriceMod!=0) %>%
             group_by(Date) %>%
             summarise(State_Price=round(mean(PriceMod))) %>%
             arrange(Date)
# prophet expects the history as columns named ds (date) and y (value).
colnames(df.Predict) = c('ds','y')
m= prophet(df.Predict)
## Warning in set_auto_seasonalities(m): Disabling yearly seasonality. Run
## prophet with `yearly.seasonality=TRUE` to override this.
## Initial log joint probability = -10.7083
## Optimization terminated normally: 
##   Convergence detected: relative gradient magnitude is below tolerance
# Extend the frame 30 days past the last observation and predict.
future = make_future_dataframe(m,period=30)
forecast =predict(m,future)
# plot() on a prophet model returns a ggplot, so it can be made
# interactive with ggplotly().
ggplotly(plot(m,forecast))